In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf 
import statsmodels.api as sm
import plotly.express as px
import plotly.offline as pyo
pyo.init_notebook_mode()
import warnings
warnings.filterwarnings("ignore")

Cleaning Data¶

Crop statistics FAO dataset¶

In [2]:
df_crops = pd.read_csv('Crops_AllData_Normalized.csv', encoding = 'latin-1')
df_crops.head()
Out[2]:
Area Code Area Item Code Item Element Code Element Year Code Year Unit Value Flag
0 2 Afghanistan 221 Almonds, with shell 5312 Area harvested 1975 1975 ha 0.0 F
1 2 Afghanistan 221 Almonds, with shell 5312 Area harvested 1976 1976 ha 5900.0 F
2 2 Afghanistan 221 Almonds, with shell 5312 Area harvested 1977 1977 ha 6000.0 F
3 2 Afghanistan 221 Almonds, with shell 5312 Area harvested 1978 1978 ha 6000.0 F
4 2 Afghanistan 221 Almonds, with shell 5312 Area harvested 1979 1979 ha 6000.0 F
In [3]:
df_avocados_yield = df_crops[(df_crops['Item'] == 'Avocados') & (df_crops['Element'] == 'Yield')]
# df_avocados_area = df_crops[(df_crops['Item'] == 'Avocados') & (df_crops['Element'] == 'Area harvested')]
In [4]:
df_avocados_yield = df_avocados_yield.dropna()
df_avocados_yield = df_avocados_yield.drop_duplicates()
In [5]:
avocados = df_avocados_yield.drop(columns = ['Area Code', 'Item Code', 'Item', 'Element Code', 
                                            'Element', 'Year Code', 'Unit', 'Flag'])
# avocados = avocados.pivot(index = 'Year', columns = 'Area', values = 'Value')
avocados = avocados.rename(columns={'Value':'Yield'})
avocados.head()
Out[5]:
Area Year Yield
45696 Argentina 1985 57778.0
45697 Argentina 1986 60000.0
45698 Argentina 1987 60606.0
45699 Argentina 1988 60606.0
45700 Argentina 1989 60606.0

FAOSTAT Temperature Change Dataset¶

In [6]:
df_temperature = pd.read_csv('FAOSTAT_data_1-10-2022.csv')
df_temperature = df_temperature.rename(columns={'Value':'temp_change'})
df_temperature.head()
Out[6]:
Domain Code Domain Area Code (FAO) Area Element Code Element Months Code Months Year Code Year Unit temp_change Flag Flag Description
0 ET Temperature change 2 Afghanistan 7271 Temperature change 7001 January 1961 1961 ?C 0.746 Fc Calculated data
1 ET Temperature change 2 Afghanistan 7271 Temperature change 7001 January 1962 1962 ?C 0.009 Fc Calculated data
2 ET Temperature change 2 Afghanistan 7271 Temperature change 7001 January 1963 1963 ?C 2.695 Fc Calculated data
3 ET Temperature change 2 Afghanistan 7271 Temperature change 7001 January 1964 1964 ?C -5.277 Fc Calculated data
4 ET Temperature change 2 Afghanistan 7271 Temperature change 7001 January 1965 1965 ?C 1.827 Fc Calculated data
In [7]:
df_temp = df_temperature.drop(columns=['Domain Code', 'Domain', 'Area Code (FAO)', 'Months Code', 'Year Code',
                                       'Element Code', 'Element', 'Unit', 'Flag', 'Flag Description'])

df_temp = df_temp.dropna()
df_temp = df_temp.drop_duplicates()
df_temp
Out[7]:
Area Months Year temp_change
0 Afghanistan January 1961 0.746
1 Afghanistan January 1962 0.009
2 Afghanistan January 1963 2.695
3 Afghanistan January 1964 -5.277
4 Afghanistan January 1965 1.827
... ... ... ... ...
229920 Zimbabwe Meteorological year 2016 1.470
229921 Zimbabwe Meteorological year 2017 0.443
229922 Zimbabwe Meteorological year 2018 0.747
229923 Zimbabwe Meteorological year 2019 1.359
229924 Zimbabwe Meteorological year 2020 0.820

222012 rows × 4 columns

In [8]:
df_temp_seasons = df_temp[(df_temp['Months'] == 'Dec?Jan?Feb') | (df_temp['Months'] == 'Mar?Apr?May') |
                         (df_temp['Months'] == 'Jun?Jul?Aug') | (df_temp['Months'] == 'Sep?Oct?Nov')]

df_temp_seasons = df_temp_seasons.replace({'Mar?Apr?May': 'Spring', 'Jun?Jul?Aug':'Summer', 
                                           'Sep?Oct?Nov':'Fall','Dec?Jan?Feb': 'Winter'})

df_temp_seasons = df_temp_seasons.rename(columns={'Months':'Season'})
df_temp_seasons.head()
Out[8]:
Area Season Year temp_change
720 Afghanistan Winter 1961 -0.751
721 Afghanistan Winter 1962 0.985
722 Afghanistan Winter 1963 1.931
723 Afghanistan Winter 1964 -2.056
724 Afghanistan Winter 1965 -0.669
In [9]:
df_temp_year = df_temp[df_temp['Months'] == 'Meteorological year']
df_temp_year = df_temp_year.drop(columns=['Months'])
df_temp_year.head()
Out[9]:
Area Year temp_change
960 Afghanistan 1961 -0.121
961 Afghanistan 1962 -0.171
962 Afghanistan 1963 0.841
963 Afghanistan 1964 -0.779
964 Afghanistan 1965 -0.254

FAOSTAT Fertilizers by Nutrients Dataset¶

In [10]:
df_fertilizer = pd.read_csv('Inputs_FertilizersNutrient_E_All_Data_NOFLAG.csv', encoding = 'latin-1')
df_fertilizer= df_fertilizer[df_fertilizer['Element'] == 'Agricultural Use']

df_fertilizer.head()
Out[10]:
Area Code Area Code (M49) Area Item Code Item Element Code Element Unit Y1961 Y1962 ... Y2011 Y2012 Y2013 Y2014 Y2015 Y2016 Y2017 Y2018 Y2019 Y2020
3 2 '004 Afghanistan 3102 Nutrient nitrogen N (total) 5157 Agricultural Use tonnes 1000.0 1000.0 ... 22970.63 17065.25 34903.79 73872.84 92515.97 96670.85 140775.85 46613.50 59702.92 43141.63
7 2 '004 Afghanistan 3103 Nutrient phosphate P2O5 (total) 5157 Agricultural Use tonnes 100.0 100.0 ... 6845.57 1261.44 3020.03 5121.35 10886.87 8991.27 16656.69 12319.04 25761.88 24956.38
11 2 '004 Afghanistan 3104 Nutrient potash K2O (total) 5157 Agricultural Use tonnes NaN NaN ... 0.00 196.78 88.11 6.67 61.84 71.73 31.48 0.62 0.00 127.77
15 3 '008 Albania 3102 Nutrient nitrogen N (total) 5157 Agricultural Use tonnes 3187.0 2000.0 ... 36892.70 34944.13 30711.91 30711.76 36803.92 42896.08 39184.92 24769.07 38329.56 32578.70
19 3 '008 Albania 3103 Nutrient phosphate P2O5 (total) 5157 Agricultural Use tonnes 2433.0 2500.0 ... 23341.22 22165.38 22906.47 22906.54 28725.33 34544.12 21263.69 13358.07 15087.82 22898.59

5 rows × 68 columns

In [11]:
df_fert = df_fertilizer.drop(columns=['Area Code', 'Area Code (M49)', 'Item Code',
                                       'Element Code', 'Element', 'Unit', 'Item'])


df_fert = df_fert.groupby('Area').mean().reset_index()
df_fert = df_fert.rename(columns={x:x[1:] for x in df_fert.columns if 'Y' in x})

df_fert = df_fert.melt(id_vars=['Area'], value_vars = [str(n) for n in range(1961, 2020 + 1)], 
                       var_name= 'Year', value_name= 'avg_fert')
df_fert = df_fert.sort_values(['Area', 'Year'])
df_fert = df_fert.fillna(0)
df_fert['Year'] = df_fert['Year'].astype(int)

df_fert
Out[11]:
Area Year avg_fert
0 Afghanistan 1961 550.000000
209 Afghanistan 1962 550.000000
418 Afghanistan 1963 550.000000
627 Afghanistan 1964 550.000000
836 Afghanistan 1965 550.000000
... ... ... ...
11703 Zimbabwe 2016 43433.333333
11912 Zimbabwe 2017 48800.000000
12121 Zimbabwe 2018 44266.666667
12330 Zimbabwe 2019 44266.666667
12539 Zimbabwe 2020 44266.666667

12540 rows × 3 columns

FAOSTAT Emissions Dataset¶

In [12]:
df_emission = pd.read_csv('Emissions_Totals_E_All_Data_(Normalized).csv', encoding = 'latin-1', low_memory = False)
df_ch4 = df_emission[df_emission['Element'] == 'Emissions (CH4)']
df_ch4 = df_ch4.drop(columns=['Area Code', 'Area Code (M49)', 'Item Code', 'Item',
                                       'Element Code', 'Element', 'Unit', 'Flag', 'Note'])

df_ch4 = df_ch4.groupby(['Area', 'Year'])['Value'].sum().reset_index()
df_ch4 = df_ch4.rename(columns={'Value':'CH4'})
df_ch4
Out[12]:
Area Year CH4
0 Afghanistan 1961 571.5586
1 Afghanistan 1962 581.7342
2 Afghanistan 1963 604.0564
3 Afghanistan 1964 613.8140
4 Afghanistan 1965 627.8378
... ... ... ...
15166 Zimbabwe 2018 2821.0779
15167 Zimbabwe 2019 2846.4932
15168 Zimbabwe 2020 2686.0864
15169 Zimbabwe 2030 2609.5168
15170 Zimbabwe 2050 3442.7856

15171 rows × 3 columns

CO2 Dataset¶

In [13]:
df_co2 = pd.read_csv("CO2 emission by countries.csv", encoding = "ISO-8859-1")

df_co2.rename(columns = {"CO2 emission (Tons)":"co2_emission_tons", "Density(km2)":"CO2_density",
                         "% of World":"percentage_of_world", "Country":"country"}, inplace = True)
In [14]:
df_co2["CO2_density"] = df_co2["CO2_density"].str.replace("/km²", "", regex=True)
df_co2["CO2_density"] = df_co2["CO2_density"].str.replace(',','.', regex=True)
df_co2["CO2_density"] = df_co2["CO2_density"].astype(float)

df_co2 = df_co2.drop(columns=['Code', 'Calling Code', 'co2_emission_tons', 'Population(2022)',
                                       'percentage_of_world', 'Area'])

df_co2 = df_co2.replace({'Congo': 'Democratic Republic of the Congo'})

df_co2.head()
Out[14]:
country Year CO2_density
0 Afghanistan 1750 63.0
1 Afghanistan 1751 63.0
2 Afghanistan 1752 63.0
3 Afghanistan 1753 63.0
4 Afghanistan 1754 63.0

Merge Datasets¶

In [15]:
df_merge = avocados.merge(df_temp_year, on=['Area', 'Year']).merge(
    df_fert, on=['Area', 'Year']).merge(df_ch4, on=['Area', 'Year'])

df_merge = df_merge.rename(columns={'Area':'country'})
df_main = pd.merge(df_merge, df_co2, on=['country', 'Year'])
df_main = df_main.replace({'Bolivia (Plurinational State of)': 'Bolivia', 
                           'Venezuela (Bolivarian Republic of)':'Venezuela', 
                           'United States of America':'United States',
                           'China, mainland':'China'})
df_main = df_main.dropna()

df_main
Out[15]:
country Year Yield temp_change avg_fert CH4 CO2_density
0 Argentina 1985 57778.0 0.245 54200.000000 6810.4064 16.0
1 Argentina 1986 60000.0 0.405 50381.333333 6649.3065 16.0
2 Argentina 1987 60606.0 0.391 55966.666667 6454.3768 16.0
3 Argentina 1988 60606.0 -0.249 54166.666667 6591.4282 16.0
4 Argentina 1989 60606.0 0.565 51033.333333 6639.3964 16.0
... ... ... ... ... ... ... ...
2268 Zimbabwe 2015 58020.0 1.226 26200.000000 2895.5529 42.0
2269 Zimbabwe 2016 58065.0 1.470 43433.333333 2704.1883 42.0
2270 Zimbabwe 2017 58027.0 0.443 48800.000000 3114.6840 42.0
2271 Zimbabwe 2018 58242.0 0.747 44266.666667 2821.0779 42.0
2272 Zimbabwe 2019 58003.0 1.359 44266.666667 2846.4932 42.0

2126 rows × 7 columns

Data Visualization¶

In [16]:
fig = px.line(df_main, x = 'Year', y = 'temp_change', color = 'country')
fig.show()
In [17]:
fig = px.line(df_main, x = 'Year', y = 'Yield', color = 'country')
fig.show()
In [18]:
total_yield = df_main.groupby(['Year'])['Yield'].sum().reset_index()
avg_temp = df_main.groupby(['Year'])['temp_change'].mean().reset_index()

fig3, ax_total = plt.subplots()
ax_total.set_xlabel('Years')
ax_total.set_ylabel('Yield (hg/ha)')
plot1 = ax_total.plot(total_yield.Year, total_yield.Yield, color = 'blue', label = 'Avocado Yield')

ax_avg = ax_total.twinx()
plot2 = ax_avg.plot(avg_temp.Year, avg_temp.temp_change, color = 'red', label = 'Temperature Change')
ax_avg.set_ylabel('Celsius')

lines, labels = ax_total.get_legend_handles_labels()
lines2, labels2 = ax_avg.get_legend_handles_labels()
ax_total.legend(lines + lines2, labels + labels2)
figure = plt.gcf()
figure.set_size_inches(12, 7)
plt.show()
In [19]:
df_country = px.data.gapminder()
df_country_code = df_country[['iso_alpha', 'country']]
df_main_cc = df_main.merge(df_country_code, on = ['country'])
df_main_cc.sort_values('Year')
global_yield = df_main_cc.groupby(['country','Year', 'iso_alpha'])['Yield'].sum().reset_index().sort_values('Year')

fig = px.scatter_geo(global_yield, locations = 'iso_alpha', color = 'Yield', color_continuous_scale = 'plasma', 
                     size = 'Yield', size_max = 30, hover_name = 'country', hover_data = dict(iso_alpha=None),
                     animation_frame = 'Year', projection = 'natural earth', 
                     range_color = [global_yield.Yield.min(), global_yield.Yield.max()], 
                     labels = dict(Yield = 'Yield (hg/ha)', Year = 'Year'))

fig.update_layout(    
    title = 'Worldwide total avocado yield from 1961 to 2019',
    title_x = 0.47, title_y = 0.95,
    title_xanchor = 'center', title_yanchor = 'top',
    dragmode = False, width = 1000, height = 600)

fig.layout.updatemenus[0].buttons[0].args[1]['frame']['duration'] = 250
fig.layout.updatemenus[0].buttons[0].args[1]['transition']['duration'] = 80

fig.show()
In [20]:
fig = px.choropleth(df_main_cc, locations = 'iso_alpha', animation_frame = 'Year', color = 'temp_change', 
                    color_continuous_scale = 'balance', range_color = [-2, 2.5], hover_name = 'country', 
                    hover_data = dict(iso_alpha = None), labels = dict(Year = 'Year', 
                                                                     temp_change = 'Temperature Change (°C)'))

fig.update_layout(title = 'Global Temperature Change by Countries from 1961 to 2019',
                  title_x = 0.45, title_y = 0.95, title_xanchor = 'center', title_yanchor = 'top',
                  dragmode = False, width = 1000, height = 600)

fig.layout.updatemenus[0].buttons[0].args[1]['frame']['duration'] = 250
fig.layout.updatemenus[0].buttons[0].args[1]['transition']['duration'] = 80

fig.show()

Data Analysis¶

In [21]:
# distribution of avocado yield (column = 'Value')
df_main['Yield'].plot(kind='hist', bins=100)
Out[21]:
<AxesSubplot:ylabel='Frequency'>
In [22]:
# log-transformed Yield
df_main['ln_yield'] = np.log(df_main['Yield'])
df_main['ln_yield'].plot(kind='hist', bins=100)
Out[22]:
<AxesSubplot:ylabel='Frequency'>
In [23]:
# temperature change distribution
df_main['temp_change'].plot(kind='hist', bins=100)
Out[23]:
<AxesSubplot:ylabel='Frequency'>
In [24]:
df_main['avg_fert'].plot(kind='hist', bins=100)
Out[24]:
<AxesSubplot:ylabel='Frequency'>
In [25]:
df_main['avg_fert'] = df_main['avg_fert'].replace(to_replace=0, method='ffill')
df_main['ln_fert'] = np.log(df_main['avg_fert'])
df_main['ln_fert'].plot(kind='hist', bins=100)
Out[25]:
<AxesSubplot:ylabel='Frequency'>
In [26]:
df_main['CH4'].plot(kind='hist', bins=100)
Out[26]:
<AxesSubplot:ylabel='Frequency'>
In [27]:
df_main['ln_ch4'] = np.log(df_main['CH4'])
df_main['ln_ch4'].plot(kind='hist', bins=100)
Out[27]:
<AxesSubplot:ylabel='Frequency'>
In [28]:
df_main['CO2_density'].plot(kind='hist', bins=30)
Out[28]:
<AxesSubplot:ylabel='Frequency'>
In [43]:
df_main.corr()
Out[43]:
Year Yield temp_change avg_fert CH4 CO2_density ln_yield ln_fert ln_ch4 ln_co2
Year 1.000000 0.036409 0.714850 0.121901 0.195074 0.078808 0.110829 0.069395 0.333730 0.016503
Yield 0.036409 1.000000 0.061455 0.002804 0.024826 0.068407 0.860336 -0.134476 -0.080331 0.162114
temp_change 0.714850 0.061455 1.000000 0.155291 0.189343 0.030131 0.129418 0.099007 0.292832 0.026070
avg_fert 0.121901 0.002804 0.155291 1.000000 0.903851 -0.031747 0.044396 0.388765 0.365501 0.014815
CH4 0.195074 0.024826 0.189343 0.903851 1.000000 -0.094636 0.082717 0.472949 0.521271 -0.084319
CO2_density 0.078808 0.068407 0.030131 -0.031747 -0.094636 1.000000 0.106073 -0.223855 -0.323180 0.821112
ln_yield 0.110829 0.860336 0.129418 0.044396 0.082717 0.106073 1.000000 -0.026004 0.072299 0.191603
ln_fert 0.069395 -0.134476 0.099007 0.388765 0.472949 -0.223855 -0.026004 1.000000 0.774744 -0.143744
ln_ch4 0.333730 -0.080331 0.292832 0.365501 0.521271 -0.323180 0.072299 0.774744 1.000000 -0.308405
ln_co2 0.016503 0.162114 0.026070 0.014815 -0.084319 0.821112 0.191603 -0.143744 -0.308405 1.000000
In [29]:
df_main['ln_co2'] = np.log(df_main['CO2_density'])
df_main['ln_co2'].plot(kind='hist', bins=30)
Out[29]:
<AxesSubplot:ylabel='Frequency'>

Regression¶

In [30]:
lr_result = smf.ols('ln_yield ~ temp_change', data = df_main).fit()
lr_result.summary()
Out[30]:
OLS Regression Results
Dep. Variable: ln_yield R-squared: 0.017
Model: OLS Adj. R-squared: 0.016
Method: Least Squares F-statistic: 36.18
Date: Tue, 20 Dec 2022 Prob (F-statistic): 2.11e-09
Time: 22:53:29 Log-Likelihood: -2134.2
No. Observations: 2126 AIC: 4272.
Df Residuals: 2124 BIC: 4284.
Df Model: 1
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
Intercept 11.0334 0.020 546.177 0.000 10.994 11.073
temp_change 0.1661 0.028 6.015 0.000 0.112 0.220
Omnibus: 180.843 Durbin-Watson: 0.162
Prob(Omnibus): 0.000 Jarque-Bera (JB): 496.738
Skew: -0.461 Prob(JB): 1.36e-108
Kurtosis: 5.181 Cond. No. 2.57


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [31]:
fig = sm.graphics.plot_ccpr(lr_result, 'temp_change')
In [32]:
mlr_result = smf.ols('ln_yield ~ temp_change + ln_fert + ln_ch4 + ln_co2 + Year', data = df_main).fit()
mlr_result.summary()
Out[32]:
OLS Regression Results
Dep. Variable: ln_yield R-squared: 0.087
Model: OLS Adj. R-squared: 0.085
Method: Least Squares F-statistic: 40.49
Date: Tue, 20 Dec 2022 Prob (F-statistic): 7.00e-40
Time: 22:53:29 Log-Likelihood: -2055.2
No. Observations: 2126 AIC: 4122.
Df Residuals: 2120 BIC: 4156.
Df Model: 5
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
Intercept 14.5657 2.555 5.700 0.000 9.554 19.577
temp_change 0.1064 0.038 2.780 0.005 0.031 0.181
ln_fert -0.0654 0.009 -7.480 0.000 -0.083 -0.048
ln_ch4 0.0985 0.011 8.902 0.000 0.077 0.120
ln_co2 0.1558 0.013 11.543 0.000 0.129 0.182
Year -0.0021 0.001 -1.640 0.101 -0.005 0.000
Omnibus: 118.536 Durbin-Watson: 0.167
Prob(Omnibus): 0.000 Jarque-Bera (JB): 328.232
Skew: -0.273 Prob(JB): 5.31e-72
Kurtosis: 4.846 Cond. No. 3.69e+05


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.69e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
In [33]:
fig = sm.graphics.plot_ccpr(mlr_result, 'temp_change')
In [34]:
countries = df_main['country'].unique().tolist()

reg_dict, reg_list = {}, []
country_list = [str(name) for name in countries]
for country in countries:
    country_data = df_main[df_main['country'] == str(country)]
    model = smf.ols('ln_yield ~ temp_change + Year + ln_fert + ln_ch4 + ln_co2', data=country_data).fit()
    reg_list.append(model)
# print(reg_list[0].summary())
for i in range(len(country_list)):
    reg_dict[country_list[i]] = reg_list[i]
# print(reg_dict['Argentina'].summary())
In [35]:
fig, ax = plt.subplots(10, 5, figsize = (36, 60))
row_counter, column_counter = 0, 0
for country in country_list:
    if column_counter < 4:
        sm.graphics.plot_ccpr(reg_dict[str(country)], 'temp_change', ax[row_counter, column_counter])
        ax[row_counter, column_counter].set_title(str(country))
        column_counter += 1
    else:
        sm.graphics.plot_ccpr(reg_dict[str(country)], 'temp_change', ax[row_counter, column_counter])
        ax[row_counter, column_counter].set_title(str(country))
        row_counter += 1
        column_counter = 0
# for i in range(2, 5):
#     fig.delaxes(ax[13][i]) 
# plt.savefig('regmap.jpg', dpi = 200)
#     print(row_counter, column_counter)
In [36]:
print(reg_dict['Mexico'].summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               ln_yield   R-squared:                       0.719
Model:                            OLS   Adj. R-squared:                  0.698
Method:                 Least Squares   F-statistic:                     34.47
Date:                Tue, 20 Dec 2022   Prob (F-statistic):           2.77e-14
Time:                        22:53:34   Log-Likelihood:                 63.291
No. Observations:                  59   AIC:                            -116.6
Df Residuals:                      54   BIC:                            -106.2
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
===============================================================================
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept       0.0871      0.252      0.346      0.731      -0.417       0.592
temp_change     0.0628      0.049      1.277      0.207      -0.036       0.161
Year            0.0071      0.003      2.770      0.008       0.002       0.012
ln_fert        -0.3434      0.034     -9.966      0.000      -0.412      -0.274
ln_ch4          0.0178      0.023      0.779      0.440      -0.028       0.064
ln_co2          0.3634      1.051      0.346      0.731      -1.743       2.470
==============================================================================
Omnibus:                        1.078   Durbin-Watson:                   0.827
Prob(Omnibus):                  0.583   Jarque-Bera (JB):                0.985
Skew:                          -0.114   Prob(JB):                        0.611
Kurtosis:                       2.409   Cond. No.                          inf
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is      0. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.

Conclusion¶

In [37]:
rsquare_list, adj_r_list, pvalue_list = ([] for i in range(3))
for country in countries:
    rsquare_list.append(reg_dict[str(country)].rsquared)
    adj_r_list.append(reg_dict[str(country)].rsquared_adj)
    pvalue_list.append(round(reg_dict[str(country)].pvalues.loc['temp_change'], 4))
    
df_country_stats = pd.DataFrame({'country': countries, 'r_square': rsquare_list,
                                 'adj_r_square': adj_r_list, 'p_value': pvalue_list})
In [38]:
country_high = df_country_stats[(df_country_stats['p_value'] < 0.05) 
                                   & (df_country_stats['r_square'] > 0.3)]
top10countries = country_high.sort_values('r_square', ascending = False).head(10)
top10countries
Out[38]:
country r_square adj_r_square p_value
34 New Zealand 0.927074 0.919959 0.0013
6 Cameroon 0.915065 0.908773 0.0149
30 Madagascar 0.470415 0.424364 0.0003
In [39]:
df_main.groupby('country')['Yield'].sum().sort_values(ascending = False).reset_index().head(10)
Out[39]:
country Yield
0 Samoa 15777857.0
1 Cuba 10736542.0
2 Kenya 8346142.0
3 Dominican Republic 8075604.0
4 El Salvador 6253056.0
5 Brazil 6138471.0
6 Mexico 5723626.0
7 Guatemala 5684889.0
8 Morocco 5652491.0
9 Peru 5639749.0
In [40]:
top10countries_cc = top10countries.merge(df_country_code, on = ['country'])
In [41]:
fig = px.choropleth(top10countries_cc, locations = 'iso_alpha',
                    color = 'r_square', 
                    hover_name = 'country', 
                    color_continuous_scale = px.colors.sequential.Plasma)
fig.show()